GWAS

Notes: look for sex-antagonism or specifically, try and identify SNPs that affect one sex in one direction and the other sex in the opposite direction.

The preparation of data is adapted from Holman and Wong’s DGRP GWAS of fitness in different contexts. See their associated workflowr report for a comprehensive breakdown of their analysis.

Load packages

Code
library(tidyverse) # tidy coding, ggplot etc
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.3     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Code
library(glue) # for coding within strings
library(bigsnpr) # to install:   devtools::install_github("privefl/bigsnpr")
Loading required package: bigstatsr
Code
library(pander) # for tables
library(future.apply) # for parallel code running
Loading required package: future
Code
library(brms) # for bayesian models
Loading required package: Rcpp
Loading 'brms' package (version 2.20.4). Useful instructions
can be found by typing help('brms'). A more detailed introduction
to the package is available through vignette('brms_overview').

Attaching package: 'brms'

The following object is masked from 'package:stats':

    ar
Code
library(ggtext) # for markdown syntax in ggplot
library(MetBrewer) # for more colour palettes
library(patchwork) # building composite plots
library(DT) # for nice tables
library(ggrepel)
library(pheatmap)

# build a helper function that produces a table to display our data

# Create a function to build HTML searchable tables

my_data_table <- function(df){
  datatable(
    df, rownames=FALSE,
    autoHideNavigation = TRUE,
    extensions = c("Scroller",  "Buttons"),
    options = list(
      autoWidth = TRUE,
      dom = 'Bfrtip',
      deferRender=TRUE,
      scrollX=TRUE, scrollY=1000,
      scrollCollapse=TRUE,
      buttons =
        list('pageLength', 'colvis', 'csv', list(
          extend = 'pdf',
          pageSize = 'A4',
          orientation = 'landscape',
          filename = 'Lifespan_data')),
      pageLength = 100
    )
  )
}

Load in DGRP variant annotations

The annotation text file was downloaded from the DGRP website: .

We use the site class of each variant for the downstream mutational burden analysis.

Code
options(future.globals.maxSize = 2000 * 1024 ^ 2, 
        stringsAsFactors = FALSE)

# Helper function to split a vector into chunks 
chunker <- function(x, max_chunk_size) split(x, ceiling(seq_along(x) / max_chunk_size))

get_variant_annotations <- function(){
  
  # Load up the big annotation file, get pertinent info. It's stored in some sort of text string format
  annot <- read.table("data/Input/dgrp.fb557.annot.txt", header = FALSE, stringsAsFactors = FALSE)
  
  get.info <- function(rows){
    lapply(rows, function(row){
      site.class.field <- strsplit(annot$V3[row], split = "]")[[1]][1]
      num.genes <- str_count(site.class.field, ";") + 1
      output <- cbind(rep(annot$V1[row], num.genes), 
                      do.call("rbind", lapply(strsplit(site.class.field, split = ";")[[1]], 
                                              function(x) strsplit(x, split = "[|]")[[1]])))
      if(ncol(output) == 5) return(output[,c(1,2,4,5)]) # only return SNPs that have some annotation. Don't get the gene symbol
      else return(NULL)
    }) %>% do.call("rbind", .)
  }
  
  plan("multisession")
  variant.details <- future_lapply(chunker(1:nrow(annot), max_chunk_size = 10000), get.info) %>% 
    do.call("rbind", .) %>% as.data.frame()
  
  names(variant.details) <- c("SNP", "FBID", "site.class", "distance.to.gene")
  variant.details$FBID <- unlist(str_extract_all(variant.details$FBID, "FBgn[:digit:]+")) # clean up text strings for Flybase ID
  variant.details %>%
    dplyr::filter(site.class != "FBgn0003638") %>% # NB this is a bug in the DGRP's annotation file
    mutate(chr = str_remove_all(substr(SNP, 1, 2), "_")) # get chromosome now for faster sorting later
}

if(!file.exists("data/Derived/annotations.csv")){
get_variant_annotations() %>% 
write_csv("data/Derived/annotations.csv")
} else annotations <- read_csv("data/Derived/annotations.csv")
Rows: 4622971 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): SNP, FBID, site.class, chr
dbl (1): distance.to.gene

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Building the dataset

Here’s the raw data from each study. This was then passed here and later returned…

Code
# Load genotyped lines reported in Huang et al for cross-matching and filtering

genotyped_lines <- read_csv("data/Input/Genotyped_lines.csv") %>% 
  mutate(line = as.factor(line),
         Genotyped = "YES")
Rows: 205 Columns: 1
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
dbl (1): line

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
# Load the lifespan datasets 

Arya_raw_data <- read_csv("data/Input/Raw_data/Arya_2010.csv") %>% 
  mutate(Sex = if_else(Sex == "F", "Female", "Male")) %>% 
  mutate(line = as.factor(line),
         Sex = as.factor(Sex),
         Vial_ID = as.factor(Vial_ID),
         Block = as.factor(Block),
         Diet = as.numeric(Diet),
         Adult_age_at_start = 2,
         Social_group_size = Density,
         Density = Density / 47,
         Death_day = Lifespan,
         Lifespan = Death_day + Adult_age_at_start) %>% 
  select(line, Sex, Vial_ID, Lifespan, Adult_age_at_start, Death_day, Block, Study, Temperature, Diet, Social_group_size, Density, Mated, Prop_female)
Rows: 13438 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Sex, Study, Mated
dbl (7): line, Vial_ID, Lifespan, Block, Temperature, Density, Prop_female
lgl (1): Diet

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Arya_lines <- Arya_raw_data %>% distinct(line) %>% inner_join(genotyped_lines) %>%  nrow() # differs from Ivanov because they collected a little bit more data in that study and combined with Ivanov. Those data were not emailed.
Joining with `by = join_by(line)`
Code
Huang_raw_data <- read_csv("data/Input/Raw_data/Huang_2020.csv") %>% 
  select(-Vial_ID2) %>% 
  mutate(line = as.factor(Line),
         Sex = as.factor(Sex),
         Vial_ID = as.factor(Vial_ID),
         Block = as.factor(Block),
         Diet = as.numeric(Diet),
         Study = "Huang_2020",
         Adult_age_at_start = 1,
         Social_group_size = Density,
         Density = Density / 47,
         Death_day = Lifespan,
         Lifespan = Death_day + Adult_age_at_start) %>% 
  select(line, Sex, Vial_ID, Lifespan, Adult_age_at_start, Death_day, Block, Study, Temperature, Diet, Social_group_size, Density, Mated, Prop_female)
Rows: 70209 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Sex, Vial_ID2, Mated
dbl (8): Line, Vial_ID, Lifespan, Block, Temperature, Prop_female, Density, ...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Huang_lines <- Huang_raw_data %>% distinct(line) %>% inner_join(genotyped_lines) %>% nrow() # perfect match with study
Joining with `by = join_by(line)`
Code
Wilson_raw_data <- read_csv("data/Input/Raw_data/Wilson_2020.csv") %>% 
  rename(Vial_ID = Replicate,
         Block = Week) %>% 
  mutate(line = as.factor(Line),
         Sex = as.factor(Sex),
         Vial_ID = as.factor(Vial_ID),
         Block = as.factor(Block),
         Diet = as.numeric(Diet),
         Adult_age_at_start = 2,
         Social_group_size = Density,
         Density = Density / 47,
         Death_day = Lifespan,
         Lifespan = Death_day + Adult_age_at_start) %>% 
  select(line, Sex, Vial_ID, Lifespan, Adult_age_at_start, Death_day, Block, Study, Temperature, Diet, Social_group_size, Density, Mated, Prop_female)
Rows: 67605 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Sex, Study, Mated
dbl (8): Line, Lifespan, Week, Replicate, Diet, Temperature, Prop_female, De...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Wilson_lines <- Wilson_raw_data %>% distinct(line) %>% inner_join(genotyped_lines) %>%  nrow()
Joining with `by = join_by(line)`
Code
Durham_raw_data <- read_csv("data/Input/Raw_data/Durham_2014.csv") %>% 
  select(1:4, 6, 14:18) %>% 
  rename(Vial_ID = Fly_ID) %>% 
  mutate(Sex = "Female") %>% 
  mutate(line = as.factor(Line),
         Sex = as.factor(Sex),
         Vial_ID = as.factor(Vial_ID),
         Block = as.factor(Block),
         Diet = as.numeric(Diet),
         Adult_age_at_start = 0,
         Social_group_size = Density,
         Density = Density / 47,
         Death_day = Lifespan,
         Lifespan = Death_day + Adult_age_at_start) %>% 
  select(line, Sex, Vial_ID, Lifespan, Adult_age_at_start, Death_day, Block, Study, Temperature, Diet, Social_group_size, Density, Mated, Prop_female)
Rows: 4132 Columns: 18
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (8): Week_1_fecundity, Week_3_fecundity, Week_5_fecundity, Week_7_fecun...
dbl (10): Fly_ID, Block, Diet, Line, Thorax_length, Lifespan, Lifetime_fecun...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Durham_lines <- Durham_raw_data %>% distinct(line) %>% anti_join(genotyped_lines) %>%  nrow()
Joining with `by = join_by(line)`
Code
Patel_raw_data <- read_csv("data/Input/Raw_data/Patel_2021.csv") %>% 
  left_join(read_csv("data/Input/Raw_data/Patel_2021.csv") %>% distinct(Line) %>% mutate(Vial_ID = 1:n())) %>% 
  mutate(Diet = 4,
         line = str_remove(Line, "DGRP "),
         line = as.factor(line),
         Sex = as.factor(Sex),
         Vial_ID = as.factor(Vial_ID),
         Block = as.factor(Block),
         Diet = as.numeric(Diet),
         Adult_age_at_start = 2,
         Social_group_size = Density,
         Density = Density / 47,
         Death_day = Lifespan,
         Lifespan = Death_day + Adult_age_at_start) %>% 
  select(line, Sex, Vial_ID, Lifespan, Adult_age_at_start, Death_day, Block, Study, Temperature, Diet, Social_group_size, Density, Mated, Prop_female)
Rows: 3860 Columns: 9
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): Line, Study, Sex, Mated
dbl (5): Lifespan, Block, Temperature, Prop_female, Density

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 3860 Columns: 9
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): Line, Study, Sex, Mated
dbl (5): Lifespan, Block, Temperature, Prop_female, Density

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Joining with `by = join_by(Line)`
Code
# follow up

Patel_lines <- Patel_raw_data %>% distinct(line) %>% inner_join(genotyped_lines) %>% nrow()
Joining with `by = join_by(line)`
Code
Dick_raw_data <- read_csv("data/Input/Raw_data/Dick_2011_tidy.csv") %>% 
  select(-Replicate) %>% 
  mutate(line = as.factor(Line),
         Sex = as.factor(Sex),
         Vial_ID = as.factor(Vial_ID),
         Block = as.factor(Block),
         Diet = as.numeric(Diet),
         Adult_age_at_start = 0,
         Social_group_size = Density,
         Density = Density / 150,
         Death_day = Lifespan,
         Lifespan = Death_day + Adult_age_at_start) %>% 
  select(line, Sex, Vial_ID, Lifespan, Adult_age_at_start, Death_day, Block, Study, Temperature, Diet, Social_group_size, Density, Mated, Prop_female)
Rows: 15397 Columns: 12
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Sex, Mated, Study
dbl (9): Line, Block, Lifespan, Replicate, Prop_female, Temperature, Diet, D...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Dick_lines <- Dick_raw_data %>% distinct(line) %>% inner_join(genotyped_lines) %>% nrow()
Joining with `by = join_by(line)`
Code
Hoffman_raw_data <- read_csv("data/Input/Raw_data/tidy_Hoffman_2021.csv") %>% 
  rename(Vial_ID = Vial) %>% 
  mutate(Sex = if_else(Sex== "F", "Female", "Male")) %>% 
  mutate(line = as.factor(Line),
         Sex = as.factor(Sex),
         Vial_ID = as.factor(Vial_ID),
         Block = as.factor(Block),
         Diet = as.numeric(Diet),
         Adult_age_at_start = 5,
         Social_group_size = Density,
         Density = Density / 47,
         Death_day = Lifespan,
         Lifespan = Death_day + Adult_age_at_start) %>% 
  select(line, Sex, Vial_ID, Lifespan, Adult_age_at_start, Death_day, Block, Study, Temperature, Diet, Social_group_size, Density, Mated, Prop_female)
Rows: 8478 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Sex, Study, Mated
dbl (8): Line, Lifespan, Vial, Block, Diet, Temperature, Prop_female, Density

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Hoffman_lines <- Hoffman_raw_data %>% distinct(line) %>% inner_join(genotyped_lines) %>% nrow()
Joining with `by = join_by(line)`
Code
Zhao_raw_data <- read_csv("data/Input/Raw_data/tidy_Zhao_2022.csv") %>% 
  rename(Vial_ID = Vial) %>% 
  mutate(line = as.factor(Line),
         Sex = as.factor(Sex),
         Vial_ID = as.factor(Vial_ID),
         Block = as.factor(Block),
         Diet = as.numeric(10),
         Adult_age_at_start = 0, # they were 3 days old upon entrance, but this is already accounted for in the death day measure
         Social_group_size = Density,
         Density = Density / 47,
         Death_day = Lifespan,
         Lifespan = Death_day + Adult_age_at_start) %>% 
  select(line, Sex, Vial_ID, Lifespan, Adult_age_at_start, Death_day, Block, Study, Temperature, Diet, Social_group_size, Density, Mated, Prop_female)
Rows: 2023 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): Sex, Study, Mated, Diet
dbl (7): Block, Line, Lifespan, Vial, Temperature, Density, Prop_female

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Zhao_lines <- Zhao_raw_data %>% distinct(line) %>% inner_join(genotyped_lines) %>% nrow()
Joining with `by = join_by(line)`
Code
# combine them into a single tibble

All_raw_data <- 
  bind_rows(Arya_raw_data,
            Huang_raw_data,
            Wilson_raw_data,
            Durham_raw_data,
            Patel_raw_data,
            Dick_raw_data,
            Hoffman_raw_data,
            Zhao_raw_data) %>% 
  filter(!is.na(Lifespan)) %>% 
  rename(Yeast_percentage = Diet) %>% 
  left_join(genotyped_lines, by = "line") %>% 
    mutate(Genotyped = if_else(is.na(Genotyped), "NO", Genotyped),
           Density = round(Density, 3))

# write_csv(All_raw_data, file ="data/Input/all_raw_data.csv")

#my_data_table(All_raw_data)

Column explanations

line: the DGRP line (genotype)

Sex: is the fly a female or a male?

Vial_ID: in many of the studies, multiple flies were kept together in the same environment. It is common practice to record which flies shared the same environment, because a common environment can explain why some flies might exhibit similar lifespan e.g. if humidity is slightly lower in one environment than another, an increased risk of desiccation is shared by all flies in that environment. It is called Vial_ID because the most common way to house Drosophila is in cylindrical vials.

Lifespan: the number of days the fly survived as an adult.

Adult_age_at_start: often, a few days elapsed before adult flies entered the lifespan assay. Thus, for comparison across studies, it is important to add these days to the Death_day count to find the true adult lifespan.

Death_day: the number of days the fly survived as an adult, that were tracked by the lifespan assay.

Block: because of the large number of DGRP lines, often studies split their experiment into separate blocks, each of which assays a fraction of the total number of lines. These blocks are temporally separated, and generally deal with different generations of flies. Note that blocks are rarely balanced with respect to line e.g. all the sampling of line 10 is done in block 1.

Study: the study that first reports the data.

Temperature: the temperature, in degrees Celsius, that the lifespan assay was completed at.

Yeast_percentage: the overall % of the food recipe made up by brewers yeast. This is a common ingredient in Drosophila food, and the key source of protein. Several studies in our dataset were specifically interested in restriction of protein availability as a method to extend lifespan.

Social_group_size: the number of flies housed together in a vial (or bottle), at the onset of the fitness assay. As individuals died, social group sizes decreased.

Density: the number of flies per cm^3 of available space. Calculated as Social_group_size / 47cm^3 for Drosophila vials and Social_group_size / 150cm^3 for Drosophila bottles.

Mated: across our identified studies, three mating states were used in the lifespan assays. Flies were either kept as virgins for their entire lives (Mated = NO), were allowed to mate for a short period prior to the lifespan assay (Mated = Prior to assay), or kept in mixed sex groups and thus allowed to mate throughout their lives.

Prop_female: the proportion of females kept within each environment.

Genotyped: indicates whether the DGRP line has sequence data available. Most do, but a few lines included in the early studies (pre-2012) did not end up getting genotyped.

Genetic analysis of life expectancy and lifespan inequality

In part 1 of this study, we calculated mean values and standard error for each combination of line, sex, temperature and mating status. These data are displayed, and can be downloaded from, the below table. Note that for GWA and other SNP based analysis, we removed lines that had not been genotyped (shown as Genotyped = NO).

Code
full_dataset <- 
  read_csv("data/input/line_means.csv") %>% 
  mutate(line = as.factor(Line),) %>%
  select(-Line) %>% 
  left_join(genotyped_lines, by = "line") %>% 
  mutate(Genotyped = if_else(is.na(Genotyped), "NO", Genotyped))
Rows: 2815 Columns: 17
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (5): Sex, Mated, Study, Density_cat, best_mod
dbl (12): Line, Temperature, Block, e0, SE_e0, h, SE_h, samp, a, b, a_SE, b_SE

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
my_data_table(full_dataset)

For CPASSOC, we split the data into their respective treatments. For the purposes of this analysis, we follow a common convention in quantitative genetics and consider each lifespan measurement a separate trait e.g. lifespan at 25 degrees is a separate trait from lifespan at 18 degrees.

Code
Arya_2010_f <-
full_dataset %>% 
  filter(Study == "Arya_2010" & Sex == "Female" & Genotyped == "YES")

Arya_2010_m <-
full_dataset %>% 
  filter(Study == "Arya_2010" & Sex == "Male" & Genotyped == "YES")

Huang_2020_f_18 <-
  full_dataset %>% 
  filter(Study == "Huang_2020" & Sex == "Female" & Temperature == 18)

Huang_2020_m_18 <-
  full_dataset %>% 
  filter(Study == "Huang_2020" & Sex == "Male" & Temperature == 18)

Huang_2020_f_25 <-
  full_dataset %>% 
  filter(Study == "Huang_2020" & Sex == "Female" & Temperature == 25)

Huang_2020_m_25 <-
  full_dataset %>% 
  filter(Study == "Huang_2020" & Sex == "Male" & Temperature == 25)

Huang_2020_f_28 <-
  full_dataset %>% 
  filter(Study == "Huang_2020" & Sex == "Female" & Temperature == 28)

Huang_2020_m_28 <-
  full_dataset %>% 
  filter(Study == "Huang_2020" & Sex == "Male" & Temperature == 28)

Wilson_2020_f <-
  full_dataset %>% 
  filter(Study == "Wilson_2020") %>% 
  distinct(line, .keep_all = TRUE)

Durham_2014_f <-
  full_dataset %>% 
  filter(Study == "Durham_2014") %>% 
  distinct(line, .keep_all = T)

Patel_2021_f <-
  full_dataset %>% 
  filter(Study == "Patel_2021")

Finally, we also load the residual line means for each sex, which are the line means after controlling for temperature, mating status and the Study_ID random effect.

Code
residual_means <- 
  read_csv("data/Input/MCMC_residuals.csv") %>% 
  mutate(line = as.factor(line)) %>% 
  inner_join(genotyped_lines) %>% 
  rename(e0 = res_e0,
         h = res_h)
Rows: 659 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Sex
dbl (3): line, res_e0, res_h

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Joining with `by = join_by(line)`

Performing GWAS

Install neccessary software and build helper functions

In addition to the R packages we load, plink 1.9 and beagle must also be installed. These software packages allow us to wrangle the data into the correct format and impute missing values, both of which are required to run GWAS with the Gemma R package.

Code
# build a function to prepare data for GWAS

prep_for_e0_GWAS <- function(data, sex){
data %>% 
  #inner_join(read_csv("data/Input/Genotyped_lines.csv"), by = "line") %>%  # filter to lines that have been genotyped
  mutate(line = glue("line{line}")) %>% 
  select(line, e0)
}

prep_for_h_GWAS <- function(data, sex){
data %>% 
  #inner_join(read_csv("data/Input/Genotyped_lines.csv"), by = "line") %>%  # filter to lines that have been genotyped
  mutate(line = glue("line{line}")) %>% 
  select(line, h)
}

# I used bigsnpr::download_plink(dir = "code") which puts it in the code folder - I'm using a windows operating system.

# Beagle is a software package for phasing genotypes and imputing ungenotyped markers.

plink <- paste(getwd(), "code/plink", sep = "/")
beagle <- bigsnpr::download_beagle()


# # Load the wolbachia infection status data from the DGRP website
# wolbachia <- read_csv("data/input/wolbachia.csv") %>% 
#   mutate(line = paste("line", line, sep = ""))
# 
# # Load the chomosomal inversion data from the DGRP website
# # these are the 5 inversions that Huang et al. PNAS corrected for
# inversions <- read_csv("data/input/inversion genotypes.csv") %>%
#     mutate(line = paste("line", line, sep = "")) %>%
#     select(line, `In(2L)t`, `In(2R)NS`, `In(3R)P`, `In(3R)K`, `In(3R)Mo`) 

# helper function to pass commands to the terminal
# Note that we set `intern = TRUE`, and pass the result of `system()` to `cat()`, 
# ensuring that the Terminal output will be printed in this knitr report.
# 
# This is the mac OS function
 
#run_command <- function(shell_command, wd = getwd(), path = ""){
#  cat(system(glue("cd ", wd, path, # tell terminal which directory to work in
#                  "\n",shell_command), # on a second terminal line, run the plink command
#             intern = TRUE), sep = '\n')  
#}

# This is the windows function 

run_command <- function(plink_command, wd = getwd(), path = "") {
  # Specify the full path to the plink executable within the 'code' subdirectory.
  plink_path <- paste(getwd(), "code/plink", sep = "/")
  
  # Create the full shell command with the plink executable.
  command <- glue("cmd.exe /c cd /d {shQuote(file.path(wd, path))} && {shQuote(plink_path)} {plink_command}")
  
  # Execute the combined command.
  result <- system(command, intern = TRUE)
  
  # Print the result.
  cat(result, sep = '\n')
  
  # Return the result as a character vector.
  return(result)
}

# sometimes we need to run terminal commands without plink - create a slightly different function to do this

run_command_no_plink <- function(shell_command, wd = getwd(), path = "") {
  
  # Create the full shell command with the plink executable.
  command <- glue("cmd.exe /c cd /d {shQuote(file.path(wd, path))} && {shell_command}")
  
  # Execute the combined command.
  result <- system(command, intern = TRUE)
  
  # Print the result.
  cat(result, sep = '\n')
  
  # Return the result as a character vector.
  return(result)
}

Build a function to create manhattan plots for later

Code
build_manhattan_plot <- function(gwas_results){
  
  manhattan_data <- gwas_results %>%
    mutate(position = str_split(SNP, "_"), # split the SNP name into logical bits
           chr = map_chr(position, ~ .x[1]), # the first bit is the chromosome arm - name the column appropriately
           position = as.numeric(map_chr(position, ~ .x[2])), # where on the chromosome is the SNP found
           pval = -1 * log10(P)) %>% # make visualising the p values easier
    select(chr, position, pval)
  
  # this next chunk finds convenient cuts for labels and colour changes 
  
  max_pos <- manhattan_data %>%
    group_by(chr) %>%
    summarise(
      max_pos = max(position), 
      middle_pos = (max_pos - min(position)) / 2,
      .groups = "drop") %>%
    as.data.frame()
  
  max_pos$max_pos <- c(0, cumsum(max_pos$max_pos[1:5]))
  
  max_pos$label_pos <- max_pos$max_pos + max_pos$middle_pos
  
  # combine the two dataframes created above
  
  manhattan_data <- manhattan_data %>%
    left_join(max_pos, by = "chr") %>%
    mutate(position = position + max_pos,
           chromosome_colour = case_when(chr == "2L" | chr == "3L" | chr == "4" ~ "A",
                                         .default = "B"),
           Notable = if_else(pval >= 8, "YES", "NO"))
  
  plot <- manhattan_data %>%
    #filter(Notable == "NO") %>% 
    ggplot(aes(position, pval, group = chr, stroke = 0.01)) +
    geom_point(aes(colour = chromosome_colour), size = 1.6) +
    geom_hline(yintercept = 8, linetype = 2, colour = "red", linewidth = 1.2) +
    scale_colour_manual(values = c(met.brewer(name = "Hokusai3")[3], met.brewer(name = "Hokusai3")[6])) +
    #geom_label_repel(data = manhattan_data %>% filter(common_SNP == "YES" &),  
     #              aes(label = position, fill = "aliceblue")) +
    #geom_point(data = manhattan_data %>% filter(Notable == "YES"),
     #          fill = "aliceblue", size = 3, 
      #         shape = 21, colour = "grey2") +
    scale_x_continuous(breaks = max_pos$label_pos, labels = max_pos$chr) +
    scale_y_continuous(expand = c(0,0), limits = c(0, 40)) +
    ylab("-log~10~(_p_)") + 
    xlab("Chromosome and position") +
    theme_classic() +
    theme(legend.position = "none",
          axis.title.y = element_markdown(size = 18),
          axis.title.x = element_markdown(size = 18),
          axis.text.x = element_text(size = 15),
          axis.text.y = element_text(size = 15))  
}

Perform SNP quality control and impute missing data

We cleaned up the DGRP’s .bed/.bim/.fam files (available from the Mackay lab website) by:

  1. Removing any SNPs for which genotypes are missing for >10% of the DGRP lines. We then use the software Beagle to impute the remaining missing genotypes. Imputation takes a long time so ideally, you only want to do it once.

  2. Removing SNPs with a minor allele frequency of less than 5%. We have negilible power to detect associations for rare SNPs below this threshold.

In the PLINK-formatted genotype files, lines fixed for the major allele are coded as 2, and lines fixed for the minor allele as 0. This means that in the association tests we calculate, negative effect sizes mean that the minor allele is associated with lower trait values, while positive effect sizes means that the minor allele is associated with higher trait values.

Code
Run_function <- FALSE # Change this to TRUE to run the models

# If the imputation is not already done, create the following 3 files of imputed genotype data: 
# dgrp2_QC_all_lines_imputed_correct.bed/bim/fam
if(!file.exists("data/Derived/dgrp2_QC_all_lines_imputed_correct.bed")) {
  perform_SNP_QC_and_imputation(phenotypes = predicted_line_means)
}

if(Run_function){
  
  # Use Plink to clean and subset the DGRP's SNP data as follows:
  # Only keep SNPs for which at least 90% of DGRP lines were successfully genotyped (--geno 0.1)
  # Only keep SNPs with a minor allele frequency of 0.05 or higher (--maf 0.05)
  # Finally, write the processed BIM/BED/FAM files to the data/derived directory
  
  output_directory <-  paste(getwd(), "data/Derived", sep = "/")
  
  run_command(glue("--bfile dgrp2",
                   " --geno 0.1 --maf 0.05 --allow-no-sex", 
                   " --make-bed --out {shQuote(output_directory)}/dgrp2_QC_all_lines"), path = "data/Input/")
  
  # Use the shell command 'sed' to remove underscores from the DGRP line names in the .fam file (e.g. 'line_120' becomes 'line120')
  # Otherwise, these underscores cause trouble when we need to convert from PLINK to vcf format (vcf format uses underscore as a separator)
  # I manually edited the text file to do this rather than using the below command, which I don't have quite working
  #for(i in 1:2) run_command_no_plink("sed -i '' 's/_//' dgrp2_QC_all_lines.fam", path = "/data/Derived/")
                
  
  # Now impute the missing genotypes using Beagle
  # This part uses the data for the full DGRP panel of >200 lines, to infer missing genotypes as accurately as possible. 
  # This step uses a lot of memory (max memory was set to 28MB in prior analysis, and it used 26.5GB), but maybe it can also run on a less powerful computer?
  # The bigsnpr package provides a helpful wrapper for Beagle called snp_beagleImpute(): it translates to a VCF file and back again using PLINK
  snp_beagleImpute(beagle, plink, 
                   bedfile.in = "data/Derived/dgrp2_QC_all_lines.bed", 
                   bedfile.out = "data/Derived/dgrp2_QC_all_lines_imputed.bed",
                   ncores = 4, 
                   memory.max = 16)
  
  # assign a sex of 'female' to all the DGRP lines (Beagle removes the sex, and it seems PLINK needs individuals to have a sex, 
  # despite PLINK having a command called --allow-no-sex)
  run_command("sed -i '' 's/    0   0   0/  0   0   2/' dgrp2_QC_all_lines_imputed.fam", path = "/data/Derived/")
  
  # Re-write the .bed file, to make sure the MAF threshold and minor/major allele designations are correctly assigned post-Beagle
  run_command(glue("--bfile dgrp2_QC_all_lines_imputed",
                   " --geno 0.1 --maf 0.05 --allow-no-sex", 
                   " --make-bed --out dgrp2_QC_all_lines_imputed_correct"), path = "/data/Derived/")
  #unlink(list.files("data/derived", pattern = "~", full.names = TRUE)) # delete the original files, which were given a ~ file name by PLINK
} else "Already run"
[1] "Already run"

Build GWAS function

Note: because PLINK defines the minor allele as the alt allele (so, lines fixed for the minor allele are scored as genotype: 2, and those with the major allele as genotype: 0), a positive effect size in these association tests means the minor allele is associated with a higher value of the trait in question.

Code
run_GWAS <- function(phenotypes){
  
  # Make a list of the lines in our sample and save as a text file for passing to PLINK
  lines_to_keep <- phenotypes %>% select(line) %>% mutate(line_2 = line)
  write.table(lines_to_keep, 
              row.names = FALSE, 
              col.names = FALSE, 
              file = "data/Derived/lines_to_keep.txt", 
              quote = FALSE)

  # Now cull the PLINK files to just the lines that we measured, and re-apply the 
  # MAF cut-off of 0.05 for the new smaller sample of DGRP lines
  run_command(glue("--bfile dgrp2_QC_all_lines_imputed_correct",
                   " --keep-allele-order", # force PLINK to retain the major/minor allele designations that apply to the DGRP as a whole
                   " --keep lines_to_keep.txt --geno 0.1 --maf 0.05", 
                   " --make-bed --out dgrp2_QC_focal_lines"), path = "/data/Derived/")
  
    # Define a function to add our phenotype data to a .fam file, which is needed for GWAS analysis and to make sure PLINK includes these samples
  # The 'phenotypes' data frame needs to have a column called 'line'
  add_phenotypes_to_fam <- function(filepath, phenotypes){
    read_delim(filepath, col_names = FALSE, delim = " ") %>% 
      select(X1, X2, X3, X4, X5) %>% # Get all the non-phenotype columns
      left_join(phenotypes, 
                by = c("X1" = "line")) %>%
      write.table(file = "data/Derived/dgrp2_QC_focal_lines_NEW.fam", 
                  col.names = FALSE, row.names = FALSE, 
                  quote = FALSE, sep = " ")
    
    unlink("data/Derived/dgrp2_QC_focal_lines.fam")
    file.rename("data/Derived/dgrp2_QC_focal_lines_NEW.fam", 
                "data/Derived/dgrp2_QC_focal_lines.fam")
  }
  
  # edit the new FAM file to add the phenotype data from 'phenotypes'
  add_phenotypes_to_fam("data/Derived/dgrp2_QC_focal_lines.fam", phenotypes)
  

  # Run mixed-model GWAS (in practice, the relatedness between almost all pairs of lines 
  # is sufficiently low that PLINK always instead chooses to run a linear model)
  
  run_command("--bfile dgrp2_QC_focal_lines  --assoc --maf 0.05", 
              path = "/data/Derived")
  
  # wrangle the GWAS results
  
  Focal_name <- deparse(substitute(phenotypes))
  
  gwas_results <- read.table("data/Derived/plink.qassoc", 
                             header = TRUE) %>% 
    select(SNP, BETA, SE, "T", P)

  # Save a file containing all SNPs with P < 1e-5:
  gwas_results %>% 
    filter(P < 1e-05) %>% 
    write_tsv(glue("data/Derived/GWAS_results/{Focal_name}_significant_SNPs.tsv.gz"))

  # Rename and compress the GWAS summary stats file 
  # The filter step means that only variants in the LD-pruned subset get saved to disk.
  gwas_results %>% 
    filter(SNP %in% (pull(read_tsv("data/Derived/plink.prune.in", col_names = "SNP"), SNP))) %>% 
    write_tsv(glue("data/Derived/GWAS_results/{Focal_name}.tsv.gz"))
  unlink("data/Derived/plink.qassoc")
  
  # Rename the plink log file
  file.rename("data/Derived/plink.log",
              glue("data/Derived/{Focal_name}_log.txt"))
  
  unlink("data/Derived/dgrp2_QC_focal_lines.bim")
  unlink("data/Derived/dgrp2_QC_focal_lines.bed")
  unlink("data/Derived/dgrp2_QC_focal_lines.fam")
  unlink("data/Derived/dgrp2_QC_focal_lines.log")
} 

Run GWAS

Code
Arya_f_l <- prep_for_e0_GWAS(Arya_2010_f)
Arya_m_l <- prep_for_e0_GWAS(Arya_2010_m)
Arya_f_h <- prep_for_h_GWAS(Arya_2010_f)
Arya_m_h <- prep_for_h_GWAS(Arya_2010_m)
Huang_f_18_l <- prep_for_e0_GWAS(Huang_2020_f_18)
Huang_f_18_h <- prep_for_h_GWAS(Huang_2020_f_18)
Huang_m_18_l <- prep_for_e0_GWAS(Huang_2020_m_18)
Huang_m_18_h <- prep_for_h_GWAS(Huang_2020_m_18)
Huang_f_25_l <- prep_for_e0_GWAS(Huang_2020_f_25)
Huang_f_25_h <- prep_for_h_GWAS(Huang_2020_f_25)
Huang_m_25_l <- prep_for_e0_GWAS(Huang_2020_m_25)
Huang_m_25_h <- prep_for_h_GWAS(Huang_2020_m_25)
Huang_f_28_l <- prep_for_e0_GWAS(Huang_2020_f_28)
Huang_f_28_h <- prep_for_h_GWAS(Huang_2020_f_28)
Huang_m_28_l <- prep_for_e0_GWAS(Huang_2020_m_28)
Huang_m_28_h <- prep_for_h_GWAS(Huang_2020_m_28)
Wilson_f_l <- prep_for_e0_GWAS(Wilson_2020_f)
Wilson_f_h <- prep_for_h_GWAS(Wilson_2020_f)
Durham_f_l <- prep_for_e0_GWAS(Durham_2014_f)
Durham_f_h <- prep_for_h_GWAS(Durham_2014_f)
Patel_f_l <- prep_for_e0_GWAS(Patel_2021_f)
Patel_f_h <- prep_for_h_GWAS(Patel_2021_f)

residual_f_l <- prep_for_e0_GWAS(residual_means %>% filter(Sex == "Female"))
residual_m_l <- prep_for_e0_GWAS(residual_means %>% filter(Sex == "Male"))
residual_f_h <- prep_for_h_GWAS(residual_means %>% filter(Sex == "Female"))
residual_m_h <- prep_for_h_GWAS(residual_means %>% filter(Sex == "Male"))

run_GWAS(Arya_f_l)
run_GWAS(Arya_m_l)
run_GWAS(Arya_f_h)
run_GWAS(Arya_m_h)
run_GWAS(Huang_f_18_l)
run_GWAS(Huang_f_18_h)
run_GWAS(Huang_m_18_l)
run_GWAS(Huang_m_18_h)
run_GWAS(Huang_f_25_l)
run_GWAS(Huang_f_25_h)
run_GWAS(Huang_m_25_l)
run_GWAS(Huang_m_25_h)
run_GWAS(Huang_f_28_l)
run_GWAS(Huang_f_28_h)
run_GWAS(Huang_m_28_l)
run_GWAS(Huang_m_28_h)
run_GWAS(Wilson_f_l)
run_GWAS(Wilson_f_h)
run_GWAS(Durham_f_l)
run_GWAS(Durham_f_h)
run_GWAS(Patel_f_l)
run_GWAS(Patel_f_h)

run_GWAS(residual_f_l)
run_GWAS(residual_m_l)
run_GWAS(residual_f_h)
run_GWAS(residual_m_h)

Add allele IDs and MAFs to annotation database

Code
# Extract and save the MAFs, SNP positions, and major/minor alleles
MAFs <- read.table("data/derived/plink.frq", 
                   header = TRUE, stringsAsFactors = FALSE) %>% 
  mutate(position = map_chr(
    strsplit(SNP, split = "_"), 
    function(x) x[2])) %>%
  select(SNP, position, MAF, A1, A2) %>% 
  rename(minor_allele = A1,
         major_allele = A2) 

MAFs <- annotations %>% 
  select(SNP, FBID, site.class, distance.to.gene, chr) %>% collect() %>%
  full_join(MAFs, by = "SNP") %>%
  filter(!is.na(MAF)) %>%
  mutate(site.class = replace(site.class, is.na(site.class), "INTERGENIC")) %>% 
  as_tibble()

# Delete the original variant annotation table from the db, and add the new one back in

Applying cross-phenotype meta-analysis

The power to detect variants associated with correlated phenotypes can be increased if a meta-analytic approach is adopted (Zhu et al, 2018). Here, we use the CPASSOC approach developed by Zhou et al (2015), which evaluates the null hypothesis that SNPs are associated with none of the considered traits, weighted by the sample size of each study and adjusted for the trait correlation matrix. The steps to apply CPMA are as follows:

  1. Estimate \(R\), the trait correlation matrix. In the DGRP, this can be done using SNP data or using line means.

  2. GWAS each trait separately (see above).

  3. Collate effect sizes for each trait into a vector \(\beta\), for each SNP.

  4. Use a Wald test statistic to estimate a vector of p-values, \(T\), from the \(\beta\) and \(\sigma\) estimates for each SNP.

  5. Test whether \(\beta\) = 0. If the trait data are homogeneous, use:

\[S_{Hom} = \frac{e^T(RW)^{-1}T(e^T(RW)^{-1}T)^T}{e^T(WRW)^{-1}e}\] where \(W\) is a diagonal matrix of weights for the individual test statistics (either the inverse of the variance or simply the sample size).

  1. If there is heterogeneity between trait measures, \(S_{Hom}\) is not appropriate. The ideal test statistic in this case would only include the cohorts and traits with a true contribution to the association of a genetic variant. To implement this, the value \(\tau\) is used as a threshold, below which traits are not included in the test statistic. This statistic, \(S_{Het}\) can be viewed as the maximum of the weighted sum of trait-specific test statistics satisfying different thresholds. To calculate \(S_{Het}\) first find,

\[S_{\tau} = \frac{e^T(R(\tau)W(\tau))^{-1}T(\tau)(e^T(R(\tau)W(\tau))^{-1}T(\tau))^T}{e^TW(\tau)^{-1}R(\tau)^{-1}W(\tau)^{-1}e}\] When \(\tau\) is large, \(S(\tau)\) can be undefined if the test statistic falls below \(\tau\) for all cohorts. In this case \(S(\tau) = 0\). Our test statistic is then

\[\displaystyle S_{Het} = \max_{r \gt 0} S(\tau)\]

Anyway, I’ve only included that for the mathematically inclined. Code to implement both statistics in R can be downloaded here, and is implemented below.

Generate the genetic correlation matrix

Using line means

Code
female_data <-
  bind_rows(Arya_2010_f,
            Huang_2020_f_18,
            Huang_2020_f_25,
            Huang_2020_f_28,
            Wilson_2020_f,
            Durham_2014_f,
            Patel_2021_f) %>% 
  select(line, Study, Sex, Temperature, e0, h) %>% 
  pivot_wider(values_from = c(e0, h), names_from = c(Study, Sex, Temperature)) 

female_data_e0 <-
  female_data %>% 
  select(contains("e0"))

female_data_h <-
  female_data %>% 
  dplyr::select(!contains("e0"), -line)

female_e0_corr_matrix <- cor(female_data_e0, use = "pairwise.complete.obs", method = "spearman")
female_h_corr_matrix <- cor(female_data_h, use = "pairwise.complete.obs", method = "spearman")

male_data <-
  bind_rows(Arya_2010_m,
            Huang_2020_m_18,
            Huang_2020_m_25,
            Huang_2020_m_28) %>% 
  select(line, Study, Sex, Temperature, e0, h) %>% 
  pivot_wider(values_from = c(e0, h), names_from = c(Study, Sex, Temperature)) 

male_data_e0 <-
  male_data %>% 
  select(contains("e0"))

male_data_h <-
  male_data %>% 
  select(!contains("e0"), -line)

male_e0_corr_matrix <- cor(male_data_e0, use = "pairwise.complete.obs", method = "spearman")
male_h_corr_matrix <- cor(male_data_h, use = "pairwise.complete.obs", method = "spearman")

all_data <- inner_join(female_data, male_data, by = "line")

all_data_e0 <- 
  all_data %>% 
  select(contains("e0"))

all_data_h <-
  all_data %>% 
  select(!contains("e0"), -line)

all_e0_corr_matrix <- cor(all_data_e0, use = "pairwise.complete.obs", method = "spearman")
all_h_corr_matrix <- cor(all_data_h, use = "pairwise.complete.obs", method = "spearman")

# install.packages("dendsort")
#library(dendsort)

#sort_hclust <- function(...) as.hclust(dendsort(as.dendrogram(...)))

#mat_cluster_cols <- sort_hclust(mat_cluster_cols)
#plot(mat_cluster_cols, main = "Sorted Dendrogram", xlab = "", sub = "")

#pheatmap(all_e0_corr_matrix)
#pheatmap(all_h_corr_matrix)

Using SNP effect sizes

Code
#female_e0_SNP_corr_matrix <- 
 # cor(Female_e0_effect_sizes %>% select(-SNP), use = "pairwise.complete.obs", method = "spearman")

Perform CPASSOC analysis

Note that the MASS package is required to run the functions we load below. Unfortunately this clashes with dplyrs select(). So be prepared to run detach("package:MASS", unload=TRUE) to get some things to work if you’re moving back and forth throughout the code.

Code
source("FunctionSet.R") # saved as a separate file in the project. 

Attaching package: 'MASS'
The following object is masked from 'package:patchwork':

    area
The following object is masked from 'package:dplyr':

    select

Attaching package: 'Matrix'
The following objects are masked from 'package:tidyr':

    expand, pack, unpack
Loading required package: compiler

e0, sexes combined

The purpose of this meta-analysis is to search for SNPs that have some effect on ageing, expressed in many different contexts (sexes, temperatures and mating status’).

To conduct CPASSOC for a given SNP, we need a summary statistic from each GWAS. A different number of lines were included in each GWAS, which can cause small differences in the number of SNPs assessed. We therefore prune the list of SNPs to those included in all univariate analyses. After pruning (and previous pruning to remove SNPs in linkage disequilibrium), 220,582 SNPs remain.

Code
# load GWAS results

Arya_f_l_GWAS <- read_tsv("data/Derived/GWAS_results/Arya_f_l.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Trait_1 = T)

Arya_m_l_GWAS <- read_tsv("data/Derived/GWAS_results/Arya_m_l.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Trait_2 = T)
  
Huang_f_18_l_GWAS <- read_tsv("data/Derived/GWAS_results/Huang_f_18_l.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Trait_3 = T)

Huang_m_18_l_GWAS <- read_tsv("data/Derived/GWAS_results/Huang_m_18_l.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Trait_4 = T)

Huang_f_25_l_GWAS <- read_tsv("data/Derived/GWAS_results/Huang_f_25_l.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Trait_5 = T)

Huang_m_25_l_GWAS <- read_tsv("data/Derived/GWAS_results/Huang_m_25_l.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Trait_6 = T)

Huang_f_28_l_GWAS <- read_tsv("data/Derived/GWAS_results/Huang_f_28_l.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Trait_7 = T)

Huang_m_28_l_GWAS <- read_tsv("data/Derived/GWAS_results/Huang_m_28_l.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Trait_8 = T)

Wilson_f_l_GWAS <- read_tsv("data/Derived/GWAS_results/Wilson_f_l.tsv.gz") %>% 
  dplyr::select(SNP, T) %>%  
  rename(Trait_9 = T)

Durham_f_l_GWAS <- read_tsv("data/Derived/GWAS_results/Durham_f_l.tsv.gz") %>% 
  dplyr::select(SNP, T) %>%  
  rename(Trait_10 = T)

Patel_f_l_GWAS <- read_tsv("data/Derived/GWAS_results/Patel_f_l.tsv.gz") %>% 
  dplyr::select(SNP, T) %>%  
  rename(Trait_11 = T)

all_e0_effect_sizes <-
  Arya_f_l_GWAS %>%
  inner_join(Arya_m_l_GWAS, by = "SNP") %>% 
  inner_join(Huang_f_18_l_GWAS, by = "SNP") %>%
  inner_join(Huang_m_18_l_GWAS, by = "SNP") %>% 
  inner_join(Huang_f_25_l_GWAS, by = "SNP") %>%
  inner_join(Huang_m_25_l_GWAS, by = "SNP") %>% 
  inner_join(Huang_f_28_l_GWAS, by = "SNP") %>% 
  inner_join(Huang_m_28_l_GWAS, by = "SNP") %>% 
  inner_join(Wilson_f_l_GWAS, by = "SNP") %>% 
  inner_join(Durham_f_l_GWAS, by = "SNP") %>% 
  inner_join(Patel_f_l_GWAS, by = "SNP") 

all_e0_effect_sizes_values <-
  all_e0_effect_sizes %>% 
  dplyr::select(2:12)

Sample_size_all <- c(165, 165, 183, 183, 186, 186, 177, 177, 161, 189, 193) 

S_hom <- SHom(all_e0_effect_sizes_values, Sample_size_all, all_e0_corr_matrix)

# calculate meta-p-values and bind the two together with the SNP names

p_S_hom <- pchisq(S_hom, df = 1, ncp = 0, lower.tail = F) %>% 
  as_tibble() %>% 
  bind_cols(S_hom) %>% 
  rename(meta_p_hom = value, 
         S_hom = ...2)

# Calculate S_het, an extension of S_hom that improves power when the genetic effect sizes vary for different traits (probably not such a problem here given all traits we investigate are some version of life expectancy)

# estimate parameters of gamma distribution

para <- EstimateGamma(N = 1E4, Sample_size_all, all_e0_corr_matrix);

S_het <- SHet(all_e0_effect_sizes_values, Sample_size_all, all_e0_corr_matrix)

# obtain P-values of S_Het using the estimated gamma parameters
  
p_S_het <- pgamma(q = S_het-para[3], shape = para[1], scale = para[2], lower.tail = F) %>% 
  as_tibble() %>% 
  bind_cols(S_het) %>% 
  rename(meta_p_het = value, 
         S_het = ...2)


all_e0_meta_results <- 
  all_e0_effect_sizes %>% 
  bind_cols(p_S_hom,
            p_S_het) # add the unadjusted p values

all_e0_meta_results <- read_csv("data/Derived/all_e0_meta_results.csv")

#write_csv(all_e0_meta_results, "data/Derived/all_e0_meta_results.csv")

h sexes combined

Code
# load GWAS results

Arya_f_h_GWAS <- read_tsv("data/Derived/GWAS_results/Arya_f_h.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Trait_1 = T)

Arya_m_h_GWAS <- read_tsv("data/Derived/GWAS_results/Arya_m_h.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Trait_2 = T)
  
Huang_f_18_h_GWAS <- read_tsv("data/Derived/GWAS_results/Huang_f_18_h.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Trait_3 = T)

Huang_m_18_h_GWAS <- read_tsv("data/Derived/GWAS_results/Huang_m_18_h.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Trait_4 = T)

Huang_f_25_h_GWAS <- read_tsv("data/Derived/GWAS_results/Huang_f_25_h.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Trait_5 = T)

Huang_m_25_h_GWAS <- read_tsv("data/Derived/GWAS_results/Huang_m_25_h.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Trait_6 = T)

Huang_f_28_h_GWAS <- read_tsv("data/Derived/GWAS_results/Huang_f_28_h.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Trait_7 = T)

Huang_m_28_h_GWAS <- read_tsv("data/Derived/GWAS_results/Huang_m_28_h.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Trait_8 = T)

Wilson_f_h_GWAS <- read_tsv("data/Derived/GWAS_results/Wilson_f_h.tsv.gz") %>% 
  dplyr::select(SNP, T) %>%  
  rename(Trait_9 = T)

Durham_f_h_GWAS <- read_tsv("data/Derived/GWAS_results/Durham_f_h.tsv.gz") %>% 
  dplyr::select(SNP, T) %>%  
  rename(Trait_10 = T)

Patel_f_h_GWAS <- read_tsv("data/Derived/GWAS_results/Patel_f_h.tsv.gz") %>% 
  dplyr::select(SNP, T) %>%  
  rename(Trait_11 = T)

all_h_effect_sizes <-
Arya_f_h_GWAS %>%
  inner_join(Arya_m_h_GWAS, by = "SNP") %>% 
  inner_join(Huang_f_18_h_GWAS, by = "SNP") %>%
  inner_join(Huang_m_18_h_GWAS, by = "SNP") %>% 
  inner_join(Huang_f_25_h_GWAS, by = "SNP") %>%
  inner_join(Huang_m_25_h_GWAS, by = "SNP") %>% 
  inner_join(Huang_f_28_h_GWAS, by = "SNP") %>% 
  inner_join(Huang_m_28_h_GWAS, by = "SNP") %>% 
  inner_join(Wilson_f_h_GWAS, by = "SNP") %>% 
  inner_join(Durham_f_h_GWAS, by = "SNP") %>% 
  inner_join(Patel_f_h_GWAS, by = "SNP") 

all_h_effect_sizes_values <-
  all_h_effect_sizes %>% 
  dplyr::select(2:12)

S_hom <- SHom(all_h_effect_sizes_values, Sample_size_all, all_h_corr_matrix)

# calculate meta-p-values and bind the two together with the SNP names

p_S_hom <- pchisq(S_hom, df = 1, ncp = 0, lower.tail = F) %>% 
  as_tibble() %>% 
  bind_cols(S_hom) %>% 
  rename(meta_p_hom = value, 
         S_hom = ...2)

# Calculate S_het, an extension of S_hom that improves power when the genetic effect sizes vary for different traits (probably not such a problem here given all traits we investigate are some version of life expectancy)

# estimate parameters of gamma distribution

para <- EstimateGamma(N = 1E4, Sample_size_all, all_h_corr_matrix);

S_het <- SHet(all_h_effect_sizes_values, Sample_size_all, all_h_corr_matrix)

# obtain P-values of S_Het using the estimated gamma parameters
  
p_S_het <- pgamma(q = S_het-para[3], shape = para[1], scale = para[2], lower.tail = F) %>% 
  as_tibble() %>% 
  bind_cols(S_het) %>% 
  rename(meta_p_het = value, 
         S_het = ...2)


all_h_meta_results <- 
  all_e0_effect_sizes %>% 
  bind_cols(p_S_hom,
            p_S_het) # add the unadjusted p values

#write_csv(all_h_meta_results, "data/Derived/all_h_meta_results.csv")

all_h_meta_results <- read_csv("data/Derived/all_h_meta_results.csv")

Plot the results

Code
detach("package:MASS", unload=TRUE)
Code
common_SNPs <- 
  all_e0_meta_results %>% filter(meta_p_het < 0.00000005) %>% 
  inner_join(all_h_meta_results %>% filter(meta_p_het < 0.00000005), by = "SNP") %>% 
  mutate(common_SNP = "YES") %>% 
  select(SNP, common_SNP)

e0_results <- 
  all_e0_meta_results %>% 
  select(SNP, meta_p_hom, meta_p_het) %>% 
  rename(P = meta_p_het) %>% 
  left_join(common_SNPs) %>% 
  mutate(common_SNP = if_else(is.na(common_SNP), "NO", common_SNP))
  

h_results <- 
  all_h_meta_results %>% 
  select(SNP, meta_p_hom, meta_p_het) %>% 
  rename(P = meta_p_het) %>% 
  left_join(common_SNPs) %>% 
  mutate(common_SNP = if_else(is.na(common_SNP), "NO", common_SNP))

# plot the results using the manhattan plot custom function we defined earlier

e0_het_plot <- build_manhattan_plot(e0_results) +
  labs(title = "Life expectancy") +
  theme(plot.title = element_text(size = 20, hjust = 0.5))

h_het_plot <- build_manhattan_plot(h_results) +
  labs(title = "Lifespan equality") +
  theme(plot.title = element_text(size = 20, hjust = 0.5))


e0_het_plot + h_het_plot

Female e0

Code
# Combine the GWAS results from each study in a wide format. inner_join allows us to limit the matrix to SNPs evaluated in all studies (not adhering to this will break CPASSOC)

Female_e0_effect_sizes <-
Arya_f_l_GWAS %>% 
  inner_join(Huang_f_18_l_GWAS, by = "SNP") %>% 
  inner_join(Huang_f_25_l_GWAS, by = "SNP") %>% 
  inner_join(Huang_f_28_l_GWAS, by = "SNP") %>% 
  inner_join(Wilson_f_l_GWAS, by = "SNP") %>% 
  inner_join(Durham_f_l_GWAS, by = "SNP") %>% 
  inner_join(Patel_f_l_GWAS, by = "SNP") 

Female_e0_effect_sizes_values <-
  Female_e0_effect_sizes %>% 
  dplyr::select(2:8)

# use the number of lines included as the weights input for now. 1/SE would be better and I think it should be possible

Sample_size <- c(165, 183, 186, 177, 161, 189, 193) 

# first calculate S_hom, which is very similar to typical fixed effect meta-analysis (and probably what we'll use in the paper)

S_hom <- SHom(Female_e0_effect_sizes_values, Sample_size, female_e0_corr_matrix)

# calculate meta-p-values and bind the two together with the SNP names

p_S_hom <- pchisq(S_hom, df = 1, ncp = 0, lower.tail = F) %>% 
  as_tibble() %>% 
  bind_cols(S_hom) %>% 
  rename(meta_p_hom = value, 
         S_hom = ...2)

# Calculate S_het, an extension of S_hom that improves power when the genetic effect sizes vary for different traits (probably not such a problem here given all traits we investigate are some version of life expectancy)

# estimate parameters of gamma distribution

para <- EstimateGamma(N = 1E4, Sample_size, female_e0_corr_matrix);

S_het <- SHet(Female_e0_effect_sizes_values, Sample_size, female_e0_corr_matrix)

# obtain P-values of S_Het using the estimated gamma parameters
  
p_S_het <- pgamma(q = S_het-para[3], shape = para[1], scale = para[2], lower.tail = F) %>% 
  as_tibble() %>% 
  bind_cols(S_het) %>% 
  rename(meta_p_het = value, 
         S_het = ...2)


e0_female_meta_results <- 
  Female_e0_effect_sizes %>% 
  bind_cols(p_S_hom,
            p_S_het) # add the unadjusted p values

Female lifespan inequality

Code
Arya_f_h_GWAS <- read_tsv("data/Derived/GWAS_results/Arya_f_h.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Trait_1 = T)
  
Huang_f_18_h_GWAS <- read_tsv("data/Derived/GWAS_results/Huang_f_18_h.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Trait_2 = T)

Huang_f_25_h_GWAS <- read_tsv("data/Derived/GWAS_results/Huang_f_25_h.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Trait_3 = T)

Huang_f_28_h_GWAS <- read_tsv("data/Derived/GWAS_results/Huang_f_28_h.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Trait_4 = T)

Wilson_f_h_GWAS <- read_tsv("data/Derived/GWAS_results/Wilson_f_h.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Trait_5 = T)

Durham_f_h_GWAS <- read_tsv("data/Derived/GWAS_results/Durham_f_h.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Trait_6 = T)

Patel_f_h_GWAS <- read_tsv("data/Derived/GWAS_results/Patel_f_h.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Trait_7 = T)

Female_h_effect_sizes <-
Arya_f_h_GWAS %>% 
  inner_join(Huang_f_18_h_GWAS, by = "SNP") %>% 
  inner_join(Huang_f_25_h_GWAS, by = "SNP") %>% 
  inner_join(Huang_f_28_h_GWAS, by = "SNP") %>% 
  inner_join(Wilson_f_h_GWAS, by = "SNP") %>% 
  inner_join(Durham_f_h_GWAS, by = "SNP") %>% 
  inner_join(Patel_f_h_GWAS, by = "SNP") 

Female_h_effect_sizes_values <-
  Female_h_effect_sizes %>% 
  dplyr::select(2:8)

# use the number of lines included as the weights input for now. 1/SE would be better and I think it should be possible

Sample_size <- c(165, 183, 186, 177, 161, 189, 193) 

S_hom <- SHom(Female_h_effect_sizes_values, Sample_size, female_h_corr_matrix)

# calculate meta-p-values and bind the two together with the SNP names

p_S_hom <- pchisq(S_hom, df = 1, ncp = 0, lower.tail = F) %>% 
  as_tibble() %>% 
  bind_cols(S_hom) %>% 
  rename(meta_p_hom = value, 
         S_hom = ...2)

# Calculate S_het, an extension of S_hom that improves power when the genetic effect sizes vary for different traits (probably not such a problem here given all traits we investigate are some version of life expectancy)

# estimate parameters of gamma distribution

para <- EstimateGamma(N = 1E4, Sample_size, female_h_corr_matrix);

S_het <- SHet(Female_h_effect_sizes_values, Sample_size, female_h_corr_matrix)

# obtain P-values of S_Het using the estimated gamma parameters
  
p_S_het <- pgamma(q = S_het-para[3], shape = para[1], scale = para[2], lower.tail = F) %>% 
  as_tibble() %>% 
  bind_cols(S_het) %>% 
  rename(meta_p_het = value, 
         S_het = ...2)


h_female_meta_results <- 
  Female_h_effect_sizes %>% 
  bind_cols(p_S_hom,
            p_S_het) 
Code
run_GWAS(residual_f_l)
run_GWAS(residual_m_l)
run_GWAS(residual_f_h)
run_GWAS(residual_m_h)

residual_f_l_GWAS <- read_tsv("data/Derived/GWAS_results/residual_f_l.tsv.gz") 
residual_m_l_GWAS <- read_tsv("data/Derived/GWAS_results/residual_m_l.tsv.gz") 
residual_f_h_GWAS <- read_tsv("data/Derived/GWAS_results/residual_f_h.tsv.gz") 
residual_m_h_GWAS <- read_tsv("data/Derived/GWAS_results/residual_m_h.tsv.gz") 

Make exploratory plots

Code
p1 <- 
  residual_f_l_GWAS %>% 
  rename(BETA_lifespan = BETA) %>% 
  mutate(Sex = "Female") %>% 
  select(SNP, BETA_lifespan, Sex) %>% 
  bind_rows(
    
    residual_m_l_GWAS %>% 
      rename(BETA_lifespan = BETA) %>% 
      mutate(Sex = "Male") %>% 
      select(SNP, BETA_lifespan, Sex)
  ) %>% 
  left_join(
    residual_f_h_GWAS %>% 
  rename(BETA_equality = BETA) %>% 
  mutate(Sex = "Female") %>% 
  select(SNP, BETA_equality, Sex) %>% 
  bind_rows(
    
    residual_m_h_GWAS %>% 
      rename(BETA_equality = BETA) %>% 
      mutate(Sex = "Male") %>% 
      select(SNP, BETA_equality, Sex)
  ),
by = c("SNP", "Sex")) %>% 
  
ggplot(aes(x = BETA_lifespan, y = BETA_equality)) +
  geom_point() +
  stat_smooth(method = "lm", formula = y ~ x + I(x^2), linewidth = 0.75, colour = "salmon") +
  coord_cartesian(xlim = c(-4, 4), ylim = c(-0.1, 0.1)) +
  ylab("SNP effect on lifespan equality") +
  xlab("SNP effect on life expectancy") + 
  facet_wrap(~Sex) +
  theme_bw() +
  theme(text = element_text(size = 14),
        strip.background = element_blank(),
        strip.text = element_text(size = 16, hjust=0.5))
  
p2 <-  
residual_f_l_GWAS %>% 
  rename(BETA_lifespan = BETA) %>% 
  mutate(Sex = "Female") %>% 
  select(SNP, BETA_lifespan, Sex) %>% 
  bind_rows(
    
    residual_m_l_GWAS %>% 
      rename(BETA_lifespan = BETA) %>% 
      mutate(Sex = "Male") %>% 
      select(SNP, BETA_lifespan, Sex)
  ) %>% 
  left_join(
    residual_f_h_GWAS %>% 
  rename(BETA_equality = BETA) %>% 
  mutate(Sex = "Female") %>% 
  select(SNP, BETA_equality, Sex) %>% 
  bind_rows(
    
    residual_m_h_GWAS %>% 
      rename(BETA_equality = BETA) %>% 
      mutate(Sex = "Male") %>% 
      select(SNP, BETA_equality, Sex)
  ),
by = c("SNP", "Sex")) %>% 
  pivot_wider(values_from = c(BETA_lifespan, BETA_equality), names_from = Sex) %>% 
  
ggplot(aes(x = BETA_lifespan_Female, y = BETA_lifespan_Male)) +
  geom_point() +
  stat_smooth(method = "lm", formula = y ~ x + I(x^2), linewidth = 0.75, colour = "salmon") +
  coord_cartesian(xlim = c(-4, 4), ylim = c(-4, 4)) +
  ylab("SNP effect on male life expectancy") +
  xlab("SNP effect on female life expectancy") + 
  theme_bw() +
  theme(text = element_text(size = 14),
        strip.background = element_blank(),
        strip.text = element_text(size = 16, hjust=0.5))

p1 / p2

Testing whether lifespan is a polygenic trait

Code
sim_data <- 
  tibble(draw_1 = rnorm(224164, 0, 1),
         draw_2 = rnorm(224164, 0, 1)) %>% 
   arrange(draw_1) %>%
  mutate(bin = c(rep(1:floor(n()/100), each = 100), 
                 rep(floor(n()/100) + 1, each = n() %% 100))) %>%
  group_by(bin) %>%
  summarise(draw_1 = mean(draw_1), draw_2 = mean(draw_2))

boyle_plot_sim <- 
  sim_data %>%
  ggplot(aes(draw_1, draw_2)) +
  geom_hline(yintercept = 0, linetype = 2) +
  geom_vline(xintercept = 0, linetype = 2) +
  geom_point(alpha = 0.8, size = 2.2) +
  stat_smooth(method = "lm", formula = y ~ x + I(x^2), linewidth = 0.75) +
  coord_cartesian(xlim = c(-4, 4), ylim = c(-4, 4)) +
  xlab("Mean effect size \n (random draw 1)") +
  ylab("Mean effect size \n (random draw 2)") + 
  theme_bw() +
  theme(strip.background = element_blank(),
        strip.text = element_text(hjust=0)) + 
  theme(text = element_text(size = 14))


real_data <-
  Female_GWAS %>% 
  select(SNP, BETA) %>% 
  rename(beta_female = BETA) %>% 
  left_join(Male_GWAS %>% 
              select(SNP, BETA) %>% 
              rename(beta_male = BETA),
            by = "SNP") %>% 
  filter(!is.na(beta_female) & !is.na(beta_male)) %>%  # remove NAs
  arrange(beta_female) %>%
  mutate(bin = c(rep(1:floor(n()/100), each = 100), 
                 rep(floor(n()/100) + 1, each = n() %% 100))) %>%
  group_by(bin) %>%
  summarise(females = mean(beta_female), males = mean(beta_male))

boyle_plot <- 
  real_data %>%
  ggplot(aes(females, males)) +
  geom_hline(yintercept = 0, linetype = 2) +
  geom_vline(xintercept = 0, linetype = 2) +
  geom_point(alpha = 0.8, size = 2.2) +
  stat_smooth(method = "lm", formula = y ~ x + I(x^2), linewidth = 0.75) +
  coord_cartesian(xlim = c(-4, 4), ylim = c(-4, 4)) +
  xlab("Mean effect size on female mean lifespan") +
  ylab("Mean effect size on male mean lifespan") + 
  theme_bw() +
  theme(strip.background = element_blank(),
        strip.text = element_text(hjust=0)) + 
  theme(text = element_text(size = 14))


boyle_plot_sim /
boyle_plot +
  plot_annotation(tag_levels = 'a')

Estimating the genetic correlation between mean lifespan and lifespan inequality

XX

Effect of mutational burden on lifespan

In this analysis we identify mutations that are likely to be deleterious, count the number of these putatively deleterious mutations per DGRP line, and test whether there is a negative association between mutational burden and lifespan.

The unguarded X

  • The homozygous nature of the DGRP makes this difficult.

  • But we can test if the X is purged of male-harming mutations more-so than female-harming mutations - that is, test the ‘faster X’ hypothesis.

Data preparation

We classified mutations as putatively deleterious if:

  • They are rare in the DGRP (<5% minor allele frequency)

  • They occur in a coding region

  • They have an effect that is likely to incur some functional change.

Following Holman and Wong (2023), we use the SnpEff designation of variants with major effects as those likely to incur a functional change. These variants are:

  • EXON_DELETED

  • NON_SYNONYMOUS_CODING

  • FRAME_SHIFT

  • CODON_CHANGE

  • CODON_INSERTION

  • CODON_CHANGE_PLUS_CODON_INSERTION

  • CODON_DELETION

  • CODON_CHANGE_PLUS_CODON_DELETION

  • STOP_GAINED

  • STOP_LOST

  • RARE_AMINO_ACID

  • START_LOST

  • START_GAINED

Code
# Load the DGRP genotype data using bigsnpr

if(!file.exists("snp_backing_file.bk")) {
rds <- snp_readBed("data/Input/dgrp2.bed", backingfile = "snp_backing_file")
}

bed_file <- snp_attach("snp_backing_file.rds")

annotations <- read.table("data/Input/dgrp.fb557.annot.txt", 
                    header = FALSE, stringsAsFactors = FALSE)

# Use PLINK to get the allele frequencies across the 205 DGRP lines

run_command("--bfile dgrp2  --freq", path = "/data/Input")

major_effect_types <- c(
  "EXON_DELETED", "NON_SYNONYMOUS_CODING",
  "FRAME_SHIFT", "CODON_CHANGE",
  "CODON_INSERTION",
  "CODON_CHANGE_PLUS_CODON_INSERTION",
  "CODON_DELETION",
  "CODON_CHANGE_PLUS_CODON_DELETION",
  "STOP_GAINED",
  "STOP_LOST",
  "RARE_AMINO_ACID",
  "START_LOST",
  "START_GAINED"
)

# this chunk detects and filters for any of the above variant types in the annotation dataset

all_major_mutations <- annotations$V1[unique(c(
  which(str_detect(annotations$V3, major_effect_types[1])),
  which(str_detect(annotations$V3, major_effect_types[2])),
  which(str_detect(annotations$V3, major_effect_types[3])),
  which(str_detect(annotations$V3, major_effect_types[4])),
  which(str_detect(annotations$V3, major_effect_types[5])),
  which(str_detect(annotations$V3, major_effect_types[6])),
  which(str_detect(annotations$V3, major_effect_types[7])),
  which(str_detect(annotations$V3, major_effect_types[8])),
  which(str_detect(annotations$V3, major_effect_types[9])),
  which(str_detect(annotations$V3, major_effect_types[10])),
  which(str_detect(annotations$V3, major_effect_types[11])),
  which(str_detect(annotations$V3, major_effect_types[12])),
  which(str_detect(annotations$V3, major_effect_types[13]))
))]

# now filter down to those with 0 < MAF < 0.05

rare_alleles <- read.table("data/Input/plink.frq", header = T) %>% 
  filter(MAF < 0.05 & MAF > 0) %>% pull(SNP)

# find alleles on the X

X_linked_alleles <- read.table("data/Input/plink.frq", header = T) %>% 
  filter(CHR == 5) %>% pull(SNP)

# and those on autosomes

autosomal_alleles <- read.table("data/Input/plink.frq", header = T) %>% 
  filter(CHR != 5) %>% pull(SNP)

# Get the indexes of variants that are major mutations and also have MAF < 0.05

indexes <- intersect(
  which(bed_file$map$marker.ID %in% all_major_mutations), 
  which(bed_file$map$marker.ID %in% rare_alleles)
)

# Get the indexes of variants that are major mutations and also have MAF < 0.05 and that are on the X

indexes_X <- intersect(
  indexes,
  which(bed_file$map$marker.ID %in% X_linked_alleles)
)

indexes_a <- intersect(
  indexes,
  which(bed_file$map$marker.ID %in% autosomal_alleles)
)

# Function to count the number of mutations in all 205 DGRP lines

count_mutations <- function(indexes){
  tibble(
    line = bed_file$fam$family.ID,
    mutation_count = sapply(
      1:205, function(i) sum(bed_file$genotypes[i, ][indexes] == 2, na.rm = T))
  ) %>% 
    mutate(line = str_remove(line, "_"))
}

mutational_burden <- count_mutations(indexes)

X_burden <- count_mutations(indexes_X)

autosomal_burden <- count_mutations(indexes_a)


# Integrate mutational burden with line lifespan data 

mutational_burden_line_data <-
  left_join(Arya_female_lifespan %>% rename(Female_lifespan = Lifespan),
            Arya_male_lifespan %>% rename(Male_lifespan = Lifespan)) %>% 
  left_join(mutational_burden,
            by = "line")

Run the bayesian linear regression

Code
mutational_burden_line_data <-
  mutational_burden_line_data %>% 
  mutate(Female_lifespan_standard = (Female_lifespan - mean(Female_lifespan))/ sd(Female_lifespan),
         Male_lifespan_standard = (Male_lifespan - mean(Male_lifespan))/ sd(Male_lifespan),
         mutation_count_100 = mutation_count/100)

# run this on individual data
individual_level_data <- 
  Arya_raw_data %>% select(line, Lifespan, Sex) %>% 
  mutate(line = as.character(line),
         line = paste("line", line, sep = "")) %>%
  group_by(Sex) %>% 
  mutate(row = row_number()) %>%
  pivot_wider(names_from = Sex, values_from = Lifespan) %>% 
  select(-row) %>% 
  right_join(mutational_burden, by = "line") %>% 
  rename(Female_lifespan = F, Male_lifespan = M) %>% 
  mutate(Female_lifespan_standard = 
           (Female_lifespan - mean(Female_lifespan, na.rm = T))/ sd(Female_lifespan, na.rm = T),
         Male_lifespan_standard = 
           (Male_lifespan - mean(Male_lifespan, na.rm = T))/ sd(Male_lifespan, na.rm = T),
         mutation_count_100 = mutation_count/100)

# raw data

individual_multivariate_mutational_burden_model <-
  brm(bf(
    mvbind(Female_lifespan, 
           Male_lifespan) ~ mutation_count_100 + (1|line)) + set_rescor(TRUE), 
    data = individual_level_data,
    family = gaussian,
    prior = c(prior(normal(60, 20), class = Intercept, resp = Femalelifespan),
              prior(normal(60, 20), class = Intercept, resp = Malelifespan),
              prior(normal(0, 2), class = b, resp = Femalelifespan),
              prior(normal(0, 2), class = b, resp = Malelifespan),
              prior(normal(1, 1), class = sigma, resp = Femalelifespan),
              prior(normal(1, 1), class = sigma, resp = Malelifespan),
              prior(lkj(2), class = rescor)),
    warmup = 2000, iter = 6000, chains = 4, cores = 4)

# means

multivariate_mutational_burden_model <-
  brm(bf(
    mvbind(Female_lifespan, 
           Male_lifespan) ~ mutation_count_100) + set_rescor(TRUE), 
    data = mutational_burden_line_data,
    family = gaussian,
    prior = c(prior(normal(60, 20), class = Intercept, resp = Femalelifespan),
              prior(normal(60, 20), class = Intercept, resp = Malelifespan),
              prior(normal(0, 2), class = b, resp = Femalelifespan),
              prior(normal(0, 2), class = b, resp = Malelifespan),
              prior(normal(1, 1), class = sigma, resp = Femalelifespan),
              prior(normal(1, 1), class = sigma, resp = Malelifespan),
              prior(lkj(2), class = rescor)),
    warmup = 2000, iter = 6000, chains = 4, cores = 4)


multivariate_mutational_burden_model

Plot results

Code
regression_lines <- conditional_effects(multivariate_mutational_burden_model, plot = F)

regression_lines <-
  bind_rows(
  regression_lines[[1]] %>% 
  as_tibble() %>% 
  select(mutation_count_100, estimate__, se__, lower__, upper__) %>% 
  rename(Posterior_lifespan = estimate__, lower = lower__, upper = upper__) %>% 
  mutate(Sex = "Female"),

  regression_lines[[2]] %>% 
  as_tibble() %>% 
  select(mutation_count_100, estimate__, se__, lower__, upper__) %>% 
  rename(Posterior_lifespan = estimate__, lower = lower__, upper = upper__) %>% 
  mutate(Sex= "Male")
)


mutational_burden_line_data %>%
  select(2:3, mutation_count_100) %>% 
  pivot_longer(1:2, names_to = "Sex",
               values_to = "Posterior_lifespan") %>% 
  mutate(Sex = str_remove(Sex, "_lifespan")) %>% 
ggplot(aes(x = mutation_count_100,
           y = Posterior_lifespan)) +
  geom_point(size = 3) +
  geom_line(data = regression_lines, aes(y = Posterior_lifespan, x = mutation_count_100,
                                         colour = Sex), linewidth = 1.2) +
  geom_ribbon(data = regression_lines, aes(ymin = lower, ymax = upper,
                                           fill = Sex),
              alpha = 0.5) +
  scale_fill_manual(values = c(met.brewer("Hokusai3")[2], met.brewer("Hokusai3")[3])) +
  scale_colour_manual(values = c(met.brewer("Hokusai3")[2], met.brewer("Hokusai3")[3])) +
  facet_wrap(~Sex) +
  labs(x = "No. deleterious mutations (100s)",
       y = "Mean lifespan (days)") +
  theme_bigstatsr() +
  theme(legend.position = "none")